#####
# Crossing Borders: Functions
#####


#----------------------------------------
# Data Building Functions:

# compare two vectors and see the elements that appear in one but not the other.
# advantage over base: it compares both ways simultaneously.
cbPalette <- c("#332288", "#117733", "#44AA99", "#88CCEE", "#DDCC77", "#CC6677", "#AA4499", "#882255")

setdiff_dw <- function(x,y){
  in_x_not_y <- setdiff(x,y)
  in_y_not_x <- setdiff(y,x)
  return(list(in_x_not_y = in_x_not_y, in_y_not_x = in_y_not_x))
}

# calculate the modal value in a vector
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

#----------------------------------------
# Analysis Functions:

# Adjusts the DOF correctly for year + person FE
felm_cb <- function(outcome = outcome, data = cb, chatty = T, f = NULL, weights = "", control = ""){
  
  if(is.null(f)){
    f <- as.formula(paste0(outcome, " ~ transBorder15 + freeBorder15 | bfs19 + year | 0 | bfs19"))
  } 
  
  if(control == "45"){ #Control group: 30-45
    data = data %>% filter(!(travelMUNmin >= 15 & travelMUNmin < 30) & travelMUNmin < 45 & BR == 1) %>% as_tibble
  }else if(control == "60"){ #Control group: 30-60
    data = data %>% filter(!(travelMUNmin >= 15 & travelMUNmin < 30) & travelMUNmin < 60 & BR == 1) %>% as_tibble
  }else{ #Control group: 15-30
    data = data %>% filter(travelMUNmin < 30 & BR == 1) %>% as_tibble
  }
  
  if(!weights == ""){
    out = felm(f, data = data %>% filter(!is.na(popJanTotal)), weights = data %>% filter(!is.na(popJanTotal)) %>% pull(popJanTotal))
  } else {
    out = felm(f, data = data)
  }
  

  if(chatty){
    print(summary(out))
    print(summary(data %>% filter(travelMUNmin > 15) %>% pull(travelMUNmin)))
  }
  
  invisible(out)
}



#Change sample to everything below 45 minutes
felm_cb_all <- function(outcome = outcome, data = cb, chatty = T, f = NULL, weights = ""){
  
  
  if(is.null(f)){
    f <- as.formula(paste0(outcome, " ~ transBorder15 + freeBorder15 | bfs19 + year | 0 | bfs19"))
  } 
  
  if(!weights == ""){
    out <- eval(
      substitute(
        felm(
          f,
          data = data[!is.na(popJanTotal)  & BR == 1],
          w = data[!is.na(popJanTotal) & BR == 1, popJanTotal])
      )
    ) 
  } else {
    out <- eval(substitute(felm(f, data = data[ BR == 1])))
  }
  
  if(chatty){
    print(summary(out))
  }
  
  invisible(out)
}


Texreg <- function(
  ...,
  custom.coef.map = list(
    "transBorder15" = "0--15 Minutes X Transition",
    "freeBorder15" = "0--15 Minutes X Free")
  ){
  texreg(
    ...,
    booktabs = T,
    dcolumn = T,
    custom.coef.map = custom.coef.map,
   caption.above = TRUE,
   use.packages = FALSE,
   include.rsquared = FALSE,
   include.adjrs = FALSE,
   float.pos = "h!"
  )
}

# some issues with this:
# 1. Assumes that if weights, they are the weight_i_pop.
# 2. couldn't get more felxible weighting choices while having the nicely formatted reg summaries
# 3. Doesn't have functionality to include covariates
# 4. Could be made to work nicely with mcmapply (lots of regressions to fit here...)
felm_shp <- function(outcome, FE, weights = "", data = shp, chatty = F){
  
  require(lfe)
  
  FE_type <- switch(FE, "bfs19 + year", "bfs19 + year + idpers", "bfs19 + year + idhouse", "year + idpers")
  # print(FE_type)
  f <- as.formula(paste0(outcome, "~ transBorder15 + freeBorder15 | ", FE_type, " | 0 | bfs19"))
  
  # prepare weights, if necessary
  if(!weights == ""){
    out <- eval(
      substitute(
        felm(
          f,
          data = data[!is.na(weight_i_pop)  & BR == 1 & travelMUNmin <= 30],
          w = data[!is.na(weight_i_pop) & BR == 1 & travelMUNmin <= 30, weight_i_pop])
        )
      ) 
    
  } else {
    
    out <- eval(substitute(felm(f, data = data[ BR == 1 & travelMUNmin <= 30])))
  }
  
  if(chatty) print(summary(out))
  
  invisible(out)
}

# felm for the vox data
felm_vox <- function(outcome, FE = 1L, weights = "", data = vox, chatty = F, cov = F, es = F){
  
  FE_type <- switch(FE, "bfs19 + year", "bfs19 + voxnr", "bfs19 + year + voxnr")
  
  if(es){
    
    use_years <- data[BR == 1 & travelMUNmin <= 30, unique(year)]
    es_coefs <- paste0("border15_", use_years[use_years != 1999], collapse = " + ")
    f <- as.formula(paste0(outcome, " ~", es_coefs, c(" + edu3 + female + age + I(age^2)")[cov], "|", FE_type, " | 0 | bfs19"))
  
  } else {
    f <- as.formula(paste0(outcome, " ~ transBorder15 + freeBorder15", c(" + edu3 + female + age + I(age^2)")[cov], "|", FE_type, " | 0 | bfs19"))
  }
  
  if(!weights == ""){
    out <- 
      eval(
        substitute(
          felm(
            f,
            data = data[BR == 1 & travelMUNmin <= 30],
            w = data[BR == 1 & travelMUNmin <= 30, gewteil]
            )
          )
        )
  } else {
    out <-
      eval(
        substitute(
          felm(
            f,
            data = data[BR == 1 & travelMUNmin <= 30]
          )
        )
      )
  }
  
  if(chatty) print(summary(out))
  
  invisible(out)
}

# this is a function to be used inside the "j" argument of a data.table to prepare regression results for plotting in a coefficient plot. Using data.table's "by=" argument, multiple coefficients can be prepared quickly.
# Original version, 20200824, DW
# Copied from the Covid-19 Project

extractPlotInfoDT <- function(
  reg, # a regression object from insire of a regression data.table.
  coef_extract = c("transBorder15", "freeBorder15") # takes a character vector of variable names to extract or can be NA_character_ just to get the first coefficient
){
  if(class(reg) == "list"){
  model <- reg[[1]]
  } else{
    model <- reg
  }
  if(all(is.na(coef_extract))){
    est <- model$coefficients[1]
    est_name <- names(model$coefficents)[1]
    ci90 <- confint(model, level = .9)[1, , drop = F]
    ci95 <- confint(model, level = .95)[1, , drop = F]
  } else{
    keep_index <- rownames(model$coefficients) %in% coef_extract
    est <- model$coefficients[keep_index]
    est_name <- rownames(model$coefficients)[keep_index]
    ci90 <- confint(model, level = .9)[keep_index, ]
    ci95 <- confint(model, level = .95)[keep_index, ]
  }
  return(data.table(y = NA, est_name = est_name, est = est, lower95 = ci95[,1], upper95 = ci95[,2], lower90 = ci90[,1], upper90 = ci90[,2]))
}

standardize <- function(x) (x - mean(x))/sd(x)

pca_1score <- function(vars, cor_type = "poly", chatty = T){
  
  if(cor_type == "poly"){
    W <- psych::polychoric(vars)$rho
  } else if(cor_type == "mix"){
    W <- psych::mixedCor(vars)$rho
  } else if(cor_type == "pear"){
    W <- cor(vars)
  }
  
  # Note: if we want to use polychoric correlation, 
  # we cannot use methods that use SVD (i.e, prcomp()). 
  # We are left with psych::principal(), which does a lot 
  # of things and the manual version of eigen(). 
  # Principal is really just a wrapper for eigen() anyway! 
  # One thing principal does it provide the _loadings_ 
  # 
  # instead of the _coefficients_ (terminology is vague). 
  # To clarify: loadings = coefficients %*% diag(sqrt(eigenvalues)). 
  # In principal(), the eigenvalues are called "SS loadings". 
  # In prcomp() they are called "sdev" and need to be squared 
  # to provide the eigenvalues. Note that created scores using 
  # the eigenvectors or the "loadings" is irrelevant: 
  # it's just multiplying all the factors by a constant. 
  # It all goes away anyway at the end step when we standardize 
  # to sd = 1.
  
  pca <- psych::principal(r = W, nfactors = ncol(W), rotate = "none")
  if(chatty){
    print(pca)
    cat("\n")
  }
  var <- apply(as.matrix(vars), 2, standardize)
  out <- standardize(var %*% pca$loadings[,1])
  
  return(out)
}


# Plotting prep ----------------------------------------------------
coef.gg.prep <- function(model.list, regions = 1){
  
  dfplot = data.frame(
    modelcoef = c(rep(as.numeric(NA), times = 2*length(model.list))),
    ylo90 = c(rep(as.numeric(NA), times = 2*length(model.list))),
    yhi90 = c(rep(as.numeric(NA), times = 2*length(model.list))),
    ylo95 = c(rep(as.numeric(NA), times = 2*length(model.list))),
    yhi95 = c(rep(as.numeric(NA), times = 2*length(model.list)))
  )
  
  tic <- 0
  for(i in 1:length(model.list)){
    
    dfplot$modelcoef[(i + tic):(i+1 + tic)] = model.list[[i]]$coefficients[1:2]
    dfplot$ylo90[(i + tic):(i+1 + tic)] = confint(model.list[[i]], level = .9)[1:2,1]
    dfplot$yhi90[(i + tic):(i+1 + tic)] = confint(model.list[[i]], level = .9)[1:2,2]
    dfplot$ylo95[(i + tic):(i+1 + tic)] = confint(model.list[[i]])[1:2,1]
    dfplot$yhi95[(i + tic):(i+1 + tic)] = confint(model.list[[i]])[1:2,2]
    
    tic <- tic + 1
  }
  
  if(regions > 1){
    dfplot$region <- rep(c("Entire Country", "German Region", "Latin Regions"), each = 2)
  }
  
  dfplot$variable <- rep(c("Transition Period", "Free Movement"), times = regions)
  dfplot$y <- rep(2:1, times = length(model.list))
  
  return(dfplot)
}


# Function to combine arsenal tables --------------------------------------
combine_arsenal_tables <- function(tab1, tab2, head1, head2) {
  # Capture output of the tables
  tab_pre <- capture.output(tab1)
  tab_post <- capture.output(tab2)
  
  # Clean unnecessary elements
  tab_pre_clean <- tab_pre[tab_pre != "" & tab_pre != "\\end{tabular}"]
  tab_post_clean <- tab_post[tab_post != "" & tab_post != "\\begin{tabular}{l|c|c|c}"]
  
  # Custom section headers
  pre_header <- paste0("\\multicolumn{4}{c}{\\textbf{", head1, "}} \\\\ \\hline")
  post_header <- paste0("\\multicolumn{4}{c}{} \\\\ \\multicolumn{4}{c}{\\textbf{", head2, "}} \\\\ \\hline")
  
  # Add section labels
  tab_pre_labeled <- c(tab_pre_clean[1], pre_header, tab_pre_clean[-1])
  tab_post_labeled <- c(tab_post_clean[1], post_header, tab_post_clean[-1])
  
  # Combine the tables
  combined_table <- c(tab_pre_labeled, "", tab_post_labeled)
  
  return(combined_table)
}